pacman::p_load(ggplot2, dplyr, RMySQL, lubridate, psych, tidyr, plotly)

## Create a database connection.
## Each credential can be overridden through an environment variable (for
## example in ~/.Renviron), so the script does not have to be edited to point
## at another account or server. The fallbacks are the values this script has
## always used, so it keeps running unchanged when nothing is set.
# NOTE(review): the fallback password is still committed in plain text --
# consider dropping it once every user has the environment variables set.
con <- dbConnect(
  MySQL(),
  user = Sys.getenv("DEEPANALYTICS_DB_USER", unset = "deepAnalytics"),
  password = Sys.getenv("DEEPANALYTICS_DB_PASSWORD", unset = "Sqltask1234!"),
  dbname = Sys.getenv("DEEPANALYTICS_DB_NAME", unset = "dataanalytics2018"),
  host = Sys.getenv(
    "DEEPANALYTICS_DB_HOST",
    unset = "data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com"
  )
)

## Show which tables the database exposes
dbListTables(conn = con)
## [1] "iris"    "yr_2006" "yr_2007" "yr_2008" "yr_2009" "yr_2010"

# IRIS
# Column names of the iris table
dbListFields(conn = con, name = "iris")
## [1] "id"            "SepalLengthCm" "SepalWidthCm"  "PetalLengthCm"
## [5] "PetalWidthCm"  "Species"

# Pull the complete iris table
irisALL <- dbGetQuery(conn = con, statement = "SELECT * FROM iris")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported
## as numeric

# Pull only the two sepal measurements
irisSELECT <- dbGetQuery(
  conn = con,
  statement = "SELECT SepalLengthCm, SepalWidthCm FROM iris"
)


# YEARLY TABLES (yr_2006 ... yr_2010)

# List the attributes of the yr_2006 table
dbListFields(con, "yr_2006")
##  [1] "id"                    "Date"                 
##  [3] "Time"                  "Global_active_power"  
##  [5] "Global_reactive_power" "Global_intensity"     
##  [7] "Voltage"               "Sub_metering_1"       
##  [9] "Sub_metering_2"        "Sub_metering_3"

# Query
# The original note here asked whether the column list could be kept in a
# vector (vars <- c(...)). It can: the columns live in `meter_vars` and the
# SELECT statement is assembled from it, so the five queries cannot drift apart.
meter_vars <- c(
  "Date", "Time", "Sub_metering_1", "Sub_metering_2", "Sub_metering_3"
)

# Fetch the columns in `vars` from one table of the open connection `con`.
# Returns whatever dbGetQuery() returns for
# "SELECT <vars joined by ', '> FROM <table_name>".
fetch_year <- function(table_name, vars = meter_vars) {
  query <- sprintf(
    "SELECT %s FROM %s",
    paste(vars, collapse = ", "),
    table_name
  )
  dbGetQuery(con, query)
}

yr_2006 <- fetch_year("yr_2006")
yr_2007 <- fetch_year("yr_2007")
yr_2008 <- fetch_year("yr_2008")
yr_2009 <- fetch_year("yr_2009")
yr_2010 <- fetch_year("yr_2010")


# Inspect every yearly table: structure, summary statistics, first rows and
# last rows. The print order matches the previous copy-pasted calls: all
# str(), then all summary(), then all head(), then all tail().
#
# Recorded on the original run:
#   table    rows    first row             last row
#   yr_2006   21992  2006-12-16 17:24:00   2006-12-31 23:59:00
#   yr_2007  521669  2007-01-01 00:00:00   2007-12-31 23:59:00
#   yr_2008  526905  2008-01-01 00:00:00   2008-12-31 23:59:00
#   yr_2009  521320  2009-01-01 00:00:00   2009-12-31 23:59:00
#   yr_2010  457394  2010-01-01 00:00:00   2010-11-26 21:02:00
# Date and Time came back as character; the three Sub_metering_* columns came
# back as numeric with minimum 0 in every year. Yearly maxima (2006..2010):
#   Sub_metering_1: 77, 78, 80, 82, 88
#   Sub_metering_2: 74, 78, 76, 77, 80
#   Sub_metering_3: 20, 20, 31, 31, 31
yearly_tables <- list(
  yr_2006 = yr_2006,
  yr_2007 = yr_2007,
  yr_2008 = yr_2008,
  yr_2009 = yr_2009,
  yr_2010 = yr_2010
)

# str() prints by itself and returns NULL, so it is not wrapped in print().
for (tbl in yearly_tables) {
  str(tbl)
}

# Inside a loop nothing is auto-printed, hence the explicit print().
for (inspect in list(summary, head, tail)) {
  for (tbl in yearly_tables) {
    print(inspect(tbl))
  }
}
# Stack the yearly tables into one data frame.
# Only 2007-2010 are combined; yr_2006 is not part of All_Years.
All_Years <- bind_rows(list(yr_2007, yr_2008, yr_2009, yr_2010))

summary(All_Years)
##      Date               Time           Sub_metering_1   Sub_metering_2  
##  Length:2027288     Length:2027288     Min.   : 0.000   Min.   : 0.000  
##  Class :character   Class :character   1st Qu.: 0.000   1st Qu.: 0.000  
##  Mode  :character   Mode  :character   Median : 0.000   Median : 0.000  
##                                        Mean   : 1.121   Mean   : 1.289  
##                                        3rd Qu.: 0.000   3rd Qu.: 1.000  
##                                        Max.   :88.000   Max.   :80.000  
##  Sub_metering_3  
##  Min.   : 0.000  
##  1st Qu.: 0.000  
##  Median : 1.000  
##  Mean   : 6.448  
##  3rd Qu.:17.000  
##  Max.   :31.000
# PREPROCESSING

## Build the combined "DateTime" text column (Date and Time pasted together,
## separated by a space) and put it first, ahead of the original columns.
dataByYears <- data.frame(
  DateTime = paste(All_Years$Date, All_Years$Time),
  All_Years,
  stringsAsFactors = FALSE
)
head(dataByYears)
##              DateTime       Date     Time Sub_metering_1 Sub_metering_2
## 1 2007-01-01 00:00:00 2007-01-01 00:00:00              0              0
## 2 2007-01-01 00:01:00 2007-01-01 00:01:00              0              0
## 3 2007-01-01 00:02:00 2007-01-01 00:02:00              0              0
## 4 2007-01-01 00:03:00 2007-01-01 00:03:00              0              0
## 5 2007-01-01 00:04:00 2007-01-01 00:04:00              0              0
## 6 2007-01-01 00:05:00 2007-01-01 00:05:00              0              0
##   Sub_metering_3
## 1              0
## 2              0
## 3              0
## 4              0
## 5              0
## 6              0

## Convert DateTime from character to POSIXct.
## The format and the time zone are passed BY NAME: the second positional
## argument of as.POSIXct() is `tz`, so passing the format string there made R
## treat it as a time zone ("unknown timezone '%Y/%m/%d %H:%M:%S'" warnings).
## The format uses "-" separators to match the data ("2007-01-01 00:00:00").
## The time zone is set in the same call, so no separate "tzone" attribute
## assignment is needed afterwards.
dataByYears$DateTime <- as.POSIXct(
  dataByYears$DateTime,
  format = "%Y-%m-%d %H:%M:%S",
  tz = "GMT+0"
)

## Check the column types after the conversion
str(dataByYears)
## 'data.frame':    2027288 obs. of  6 variables:
##  $ DateTime      : POSIXct, format: "2007-01-01 00:00:00" "2007-01-01 00:01:00" ...
##  $ Date          : chr  "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
##  $ Time          : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Sub_metering_1: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3: num  0 0 0 0 0 0 0 0 0 0 ...

# Split DateTime into its calendar/clock components, one new column each.
# The lubridate accessors are namespaced because the new columns share their
# names (year, month, day, hour, minute).
dataByYears <- dplyr::mutate(
  dataByYears,
  year = lubridate::year(DateTime),
  month = lubridate::month(DateTime),
  weekday = weekdays(DateTime),
  day = lubridate::day(DateTime),
  hour = lubridate::hour(DateTime),
  minute = lubridate::minute(DateTime)
)



# EXPLORATION OF THE DATA. First approach
summary(dataByYears)
##     DateTime                       Date               Time          
##  Min.   :2007-01-01 00:00:00   Length:2027288     Length:2027288    
##  1st Qu.:2007-12-21 16:32:45   Class :character   Class :character  
##  Median :2008-12-07 16:38:30   Mode  :character   Mode  :character  
##  Mean   :2008-12-09 17:30:05                                        
##  3rd Qu.:2009-11-27 16:09:15                                        
##  Max.   :2010-11-26 21:02:00                                        
##  Sub_metering_1   Sub_metering_2   Sub_metering_3        year     
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :2007  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.:2007  
##  Median : 0.000   Median : 0.000   Median : 1.000   Median :2008  
##  Mean   : 1.121   Mean   : 1.289   Mean   : 6.448   Mean   :2008  
##  3rd Qu.: 0.000   3rd Qu.: 1.000   3rd Qu.:17.000   3rd Qu.:2009  
##  Max.   :88.000   Max.   :80.000   Max.   :31.000   Max.   :2010  
##      month          weekday               day             hour     
##  Min.   : 1.000   Length:2027288     Min.   : 1.00   Min.   : 0.0  
##  1st Qu.: 3.000   Class :character   1st Qu.: 8.00   1st Qu.: 5.0  
##  Median : 6.000   Mode  :character   Median :16.00   Median :12.0  
##  Mean   : 6.394                      Mean   :15.62   Mean   :11.5  
##  3rd Qu.: 9.000                      3rd Qu.:23.00   3rd Qu.:18.0  
##  Max.   :12.000                      Max.   :31.00   Max.   :23.0  
##      minute    
##  Min.   : 0.0  
##  1st Qu.:15.0  
##  Median :30.0  
##  Mean   :29.5  
##  3rd Qu.:44.0  
##  Max.   :59.0
 # Descriptive statistics from the psych package, numeric columns only.
 # When describe() was given the whole data frame, the non-numeric columns
 # (DateTime, Date, Time, weekday) came out as NaN / NA / Inf / -Inf rows and
 # raised "no non-missing arguments to min/max" warnings, so they are
 # filtered out before the call.
 numeric_cols <- vapply(dataByYears, is.numeric, logical(1))
 describe(dataByYears[, numeric_cols, drop = FALSE])
## Recorded on the original run (numeric columns):
##                      n    mean    sd  min  max range   se
## Sub_metering_1 2027288    1.12  6.15    0   88    88 0.00
## Sub_metering_2 2027288    1.29  5.79    0   80    80 0.00
## Sub_metering_3 2027288    6.45  8.43    0   31    31 0.01
## year           2027288 2008.45  1.10 2007 2010     3 0.00
## month          2027288    6.39  3.39    1   12    11 0.00
## day            2027288   15.62  8.80    1   31    30 0.01
## hour           2027288   11.50  6.92    0   23    23 0.00
## minute         2027288   29.50 17.32    0   59    59 0.01

# QQNorm: normal Q-Q plot of each sub-metering column
qqnorm(dataByYears$Sub_metering_1)

qqnorm(dataByYears$Sub_metering_2)

qqnorm(dataByYears$Sub_metering_3)

 # Count missing values per column
 summary(is.na(dataByYears))
##   DateTime          Date            Time         Sub_metering_1 
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:2027288   FALSE:2027288   FALSE:2027288   FALSE:2027288  
##  Sub_metering_2  Sub_metering_3     year           month        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:2027288   FALSE:2027288   FALSE:2027288   FALSE:2027288  
##   weekday           day             hour           minute       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:2027288   FALSE:2027288   FALSE:2027288   FALSE:2027288

 # Give the three sub-metering columns descriptive names
 dataByYears <- dplyr::rename(
   dataByYears,
   Kitchen = Sub_metering_1,
   Laundry = Sub_metering_2,
   AC_Heater = Sub_metering_3
 )

 # GATHERING ALL SUB_METERINGS
 # Long format: the three renamed columns are stacked into a "Sub_metering"
 # key column and a "Value" column; every other column is kept as is.
 sub_meterings_All <- tidyr::gather(
   dataByYears,
   key = "Sub_metering",
   value = "Value",
   Kitchen:AC_Heater
 )
 
 # VISUALIZATIONS. First approach

 # 1Var
 # One sub-metering column at a time

 # Histograms
 hist(dataByYears$Kitchen)

 hist(dataByYears$Laundry)

 hist(dataByYears$AC_Heater)

 # Boxplots, one box per year
 boxplot(formula = Kitchen ~ year, data = dataByYears)

 boxplot(formula = Laundry ~ year, data = dataByYears)

 boxplot(formula = AC_Heater ~ year, data = dataByYears)

 # 2Vars
 # Sub_meterings and Years

 # Boxplots: horizontal boxes per sub-metering, one facet column per year
 ggplot(data = sub_meterings_All, mapping = aes(x = Sub_metering, y = Value)) +
   geom_boxplot() +
   coord_flip() +
   facet_grid(. ~ year)

 # Histograms: overlaid (not stacked) density-scaled histograms
 ggplot(data = sub_meterings_All, mapping = aes(x = Value, fill = Sub_metering)) +
   geom_histogram(
     mapping = aes(y = ..density..),
     alpha = 0.5,
     position = "identity"
   )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 # Density curves per sub-metering
 ggplot(data = sub_meterings_All, mapping = aes(x = Value, fill = Sub_metering)) +
   geom_density(alpha = 0.2)

 #qqPlot(lm(prestige ~ income + education + type, data=Duncan),envelope=.99)
 
 
 # EXPLORATION OF THE DATA. Second approach

 # Frequency of every reading across the three sub-meterings:
 # n = number of rows with that Value, freq = n as a share of all rows.
 Values_n <- sub_meterings_All %>%
   group_by(Value) %>%
   summarise(n = n()) %>%
   mutate(freq = n / sum(n))

 # Same table after dropping the zero readings.
 # The row filters do not need group_by(Value): the condition is evaluated
 # row by row, and grouping first only left Values_not_0 / Values_not_0_1 as
 # grouped tibbles for every later step.
 Values_not_0 <- sub_meterings_All %>% filter(Value != 0)
 Values_not_0_n <- Values_not_0 %>%
   group_by(Value) %>%
   summarise(n = n()) %>%
   mutate(freq = n / sum(n))

 # Same table again, keeping only readings greater than 1.
 Values_not_0_1 <- sub_meterings_All %>% filter(Value > 1)
 Values_not_0_1_n <- Values_not_0_1 %>%
   group_by(Value) %>%
   summarise(n = n()) %>%
   mutate(freq = n / sum(n))
 Values_not_0_1_n
## # A tibble: 86 x 3
##    Value      n    freq
##    <dbl>  <int>   <dbl>
##  1     2 176481 0.171  
##  2     3   9505 0.00922
##  3     4   7988 0.00775
##  4     5   6562 0.00637
##  5     6   3812 0.00370
##  6     7   3080 0.00299
##  7     8   2782 0.00270
##  8     9   2901 0.00282
##  9    10   4215 0.00409
## 10    11  12128 0.0118 
## # … with 76 more rows
# VISUALIZATIONS. Second approach (zero readings removed)
# 2Vars
 # Sub_meterings and Years

 # Boxplots: horizontal boxes per sub-metering, one facet column per year
 ggplot(data = Values_not_0, mapping = aes(x = Sub_metering, y = Value)) +
   geom_boxplot() +
   coord_flip() +
   facet_grid(. ~ year)

 # Histograms: overlaid (not stacked) density-scaled histograms
 ggplot(data = Values_not_0, mapping = aes(x = Value, fill = Sub_metering)) +
   geom_histogram(
     mapping = aes(y = ..density..),
     alpha = 0.5,
     position = "identity"
   )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 # Density curves per sub-metering
 ggplot(data = Values_not_0, mapping = aes(x = Value, fill = Sub_metering)) +
   geom_density(alpha = 0.2)

 # VISUALIZATIONS. Third approach (only readings greater than 1)
# 2Vars
 # Sub_meterings and Years

 # Boxplots: horizontal boxes per sub-metering, one facet column per year
 ggplot(Values_not_0_1, aes(Sub_metering, Value)) +
   geom_boxplot() +
   coord_flip() +
   facet_grid(. ~ year)

 # Histograms
 # One stacked bar per distinct Value. This used to be
 # geom_histogram(bin=50, stat="count"): with stat = "count" the bin settings
 # are ignored (hence the "Ignoring unknown parameters" warning), so the
 # layer was already a plain count bar chart. geom_bar() draws the same
 # layer without the warning.
 ggplot_0_50_non_0_1 <- ggplot(Values_not_0_1, aes(Value, fill = Sub_metering)) +
   geom_bar()
 ggplot_0_50_non_0_1 %>% ggplotly()
 # convert as factor?
 # NOTE(review): after this conversion the density plot below is drawn over a
 # discrete x axis -- confirm that this is the intended plot.
 Values_not_0_1$Value <- as.factor(Values_not_0_1$Value)

 # Density
 ggplot(Values_not_0_1, aes(Value, fill = Sub_metering)) +
   geom_density(alpha = 0.2)

 # Turn off scientific notation in printed numbers and axis labels
 options(scipen = 999)
 # scale x breaks, xlim
 
 
 
 #PEAKS
 
 # I found that there is a peak in AC_Heater at 18 watts/hour. 17 and 19 are also high, so this might correspond to the oscillation in watts as the appliance turns on and off.

 #